{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1 相似度计算\n",
    "\n",
    "狭义的协同过滤即基于用户的喜好来自动实现用户的喜好判断。比如：A、B两位用户对某个事物相同的观点，那么A用户在其他事物的观点相比于随机挑选的一个人而言会更加接近B用户。广义的协同过滤，数据源更加广泛。之所以叫协同过滤，是因为在实现过滤推荐的时候是根据其他人的行为来做出预测的。\n",
    "\n",
    "第一步是如何评价两位用户之间的相似度。\n",
    "\n",
    "**注**: 电影评分数据源请点击[movielens](https://grouplens.org/datasets/movielens/20m/)去下载。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from IPython.display import Latex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies = pd.read_csv('./ml-20m/movies.csv')\n",
    "ratings = pd.read_csv('./ml-20m/ratings.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对数据进行相加，通过movieId关联。\n",
    "data = pd.merge(movies, ratings, on = \"movieId\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存入文本，方便后期处理（这一步比较耗时）\n",
    "# data[['userId', 'rating', 'movieId', 'title']].sort_values('userId').to_csv('./data.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>rating</th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>11</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>12</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>14</td>\n",
       "      <td>4.5</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>16</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>19</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  rating  movieId             title\n",
       "0       3     4.0        1  Toy Story (1995)\n",
       "1       6     5.0        1  Toy Story (1995)\n",
       "2       8     4.0        1  Toy Story (1995)\n",
       "3      10     4.0        1  Toy Story (1995)\n",
       "4      11     4.5        1  Toy Story (1995)\n",
       "5      12     4.0        1  Toy Story (1995)\n",
       "6      13     4.0        1  Toy Story (1995)\n",
       "7      14     4.5        1  Toy Story (1995)\n",
       "8      16     3.0        1  Toy Story (1995)\n",
       "9      19     5.0        1  Toy Story (1995)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[['userId', 'rating', 'movieId', 'title']][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#打开文件\n",
    "with open('./data.csv') as file:\n",
    "    data = {}\n",
    "    for line in file.readlines()[1:1000]:\n",
    "        line = line.strip().split(\",\")\n",
    "        if not line[0] in data.keys():\n",
    "            data[line[0]] = {line[3]:line[1]}\n",
    "        else:\n",
    "            data[line[0]][line[3]] = line[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{\"Monty Python's The Meaning of Life (1983)\": '3.5',\n",
       " 'Kill Bill: Vol. 2 (2004)': '4.0',\n",
       " '\"Ring': '3.5',\n",
       " 'Shrek (2001)': '4.0',\n",
       " 'Contact (1997)': '3.5',\n",
       " 'Die Hard (1988)': '4.0',\n",
       " 'Rumble in the Bronx (Hont faan kui) (1995)': '3.5',\n",
       " 'One Million Years B.C. (1966)': '4.0',\n",
       " '\"Mask': '3.5',\n",
       " 'Dawn of the Dead (1978)': '3.5',\n",
       " 'Freaks (1932)': '5.0',\n",
       " 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': '3.5',\n",
       " 'Ringu (Ring) (1998)': '3.5',\n",
       " 'Escape to Witch Mountain (1975)': '3.5',\n",
       " 'Videodrome (1983)': '4.0',\n",
       " 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)': '4.0',\n",
       " 'Interview with the Vampire: The Vampire Chronicles (1994)': '4.0',\n",
       " 'Star Trek II: The Wrath of Khan (1982)': '4.0',\n",
       " \"Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)\": '4.0',\n",
       " 'Highlander: Endgame (Highlander IV) (2000)': '4.0',\n",
       " 'Seven (a.k.a. Se7en) (1995)': '3.5',\n",
       " '\"Lock': '4.0',\n",
       " '2001: A Space Odyssey (1968)': '3.5',\n",
       " 'Memento (2000)': '3.5',\n",
       " '\"Wizard of Oz': '3.5',\n",
       " '\"Incredibles': '4.0',\n",
       " 'Butch Cassidy and the Sundance Kid (1969)': '3.0',\n",
       " 'Clash of the Titans (1981)': '4.0',\n",
       " 'Star Wars: Episode I - The Phantom Menace (1999)': '4.0',\n",
       " 'Wilder Napalm (1993)': '3.5',\n",
       " 'Star Wars: Episode IV - A New Hope (1977)': '4.0',\n",
       " '\"Fish Called Wanda': '4.0',\n",
       " \"Monty Python's Life of Brian (1979)\": '3.5',\n",
       " '\"Dead Zone': '4.0',\n",
       " 'Final Fantasy: The Spirits Within (2001)': '3.5',\n",
       " 'Vampire Hunter D: Bloodlust (Banpaia hantâ D) (2000)': '3.5',\n",
       " '\"Sword and the Sorcerer': '3.5',\n",
       " 'Watership Down (1978)': '4.0',\n",
       " '\"Dark Crystal': '4.0',\n",
       " 'Legend (1985)': '4.0',\n",
       " 'Donnie Darko (2001)': '3.5',\n",
       " 'Monty Python and the Holy Grail (1975)': '3.5',\n",
       " '\"Navigator: A Mediaeval Odyssey': '4.0',\n",
       " 'Beetlejuice (1988)': '4.0',\n",
       " 'Willow (1988)': '4.0',\n",
       " '\"Untouchables': '3.5',\n",
       " '\"Picture of Dorian Gray': '3.5',\n",
       " 'Harry Potter and the Chamber of Secrets (2002)': '4.0',\n",
       " 'E.T. the Extra-Terrestrial (1982)': '4.0',\n",
       " '\"Thing': '4.0',\n",
       " '\"Company of Wolves': '4.0',\n",
       " 'Edward Scissorhands (1990)': '4.0',\n",
       " 'Peter Pan (2003)': '3.5',\n",
       " 'Jaws (1975)': '4.0',\n",
       " '\"City of Lost Children': '3.5',\n",
       " 'Underworld (2003)': '4.0',\n",
       " '\"Time Machine': '4.0',\n",
       " 'Platoon (1986)': '4.0',\n",
       " 'Bubba Ho-tep (2002)': '3.5',\n",
       " 'Reservoir Dogs (1992)': '3.5',\n",
       " 'Indiana Jones and the Last Crusade (1989)': '3.5',\n",
       " 'Toys (1992)': '3.5',\n",
       " \"One Flew Over the Cuckoo's Nest (1975)\": '3.5',\n",
       " 'Scary Movie 3 (2003)': '3.0',\n",
       " 'Frankenstein (1931)': '3.5',\n",
       " '\"Nosferatu (Nosferatu': '3.5',\n",
       " '\"Omen': '3.5',\n",
       " 'Princess Mononoke (Mononoke-hime) (1997)': '3.5',\n",
       " 'Jason and the Argonauts (1963)': '4.0',\n",
       " 'Snatch (2000)': '4.0',\n",
       " 'Yojimbo (1961)': '3.0',\n",
       " 'Little Big Man (1970)': '3.5',\n",
       " '\"Brotherhood of the Wolf (Pacte des loups': '4.0',\n",
       " '\"Witches': '3.5',\n",
       " '\"Witches of Eastwick': '4.0',\n",
       " 'Sleepy Hollow (1999)': '4.0',\n",
       " 'Dungeons & Dragons (2000)': '3.5',\n",
       " '\"Crouching Tiger': '4.0',\n",
       " '\"Beastmaster': '3.0',\n",
       " 'Dragonslayer (1981)': '4.0',\n",
       " 'Spider-Man 2 (2004)': '4.5',\n",
       " '\"Silence of the Lambs': '3.5',\n",
       " 'Teenage Mutant Ninja Turtles (1990)': '3.5',\n",
       " '\"7th Voyage of Sinbad': '4.0',\n",
       " '\"Lord of the Rings: The Return of the King': '5.0',\n",
       " 'X2: X-Men United (2003)': '4.0',\n",
       " 'Sling Blade (1996)': '4.0',\n",
       " 'Constantine (2005)': '4.0',\n",
       " '\"Invisible Man': '3.0',\n",
       " 'Terminator 2: Judgment Day (1991)': '3.5',\n",
       " 'Flash Gordon (1980)': '3.5',\n",
       " 'Hard-Boiled (Lat sau san taam) (1992)': '3.5',\n",
       " 'Die Hard 2 (1990)': '3.0',\n",
       " 'Conan the Destroyer (1984)': '3.0',\n",
       " 'Dracula (1931)': '3.5',\n",
       " 'Time Bandits (1981)': '4.0',\n",
       " \"Jacob's Ladder (1990)\": '3.5',\n",
       " 'Invasion of the Body Snatchers (1956)': '3.5',\n",
       " 'Austin Powers: The Spy Who Shagged Me (1999)': '3.5',\n",
       " 'Run Lola Run (Lola rennt) (1998)': '3.5',\n",
       " 'Masters of the Universe (1987)': '3.0',\n",
       " '\"Lost Boys': '4.0',\n",
       " '\"Evil Dead': '3.5',\n",
       " 'Warriors of Virtue (1997)': '3.0',\n",
       " 'Ghostbusters (a.k.a. Ghost Busters) (1984)': '3.5',\n",
       " '\"American Werewolf in London': '4.0',\n",
       " '\"Birds': '4.0',\n",
       " '\"Iron Giant': '3.0',\n",
       " '\"Sixth Sense': '4.0',\n",
       " '\"Christmas Story': '3.5',\n",
       " 'Invasion of the Body Snatchers (1978)': '3.5',\n",
       " '\"O Brother': '4.0',\n",
       " 'Excalibur (1981)': '4.0',\n",
       " 'Misery (1990)': '4.0',\n",
       " \"Ferris Bueller's Day Off (1986)\": '3.5',\n",
       " 'Hook (1991)': '4.0',\n",
       " 'Blade Runner (1982)': '4.0',\n",
       " '\"Shawshank Redemption': '4.0',\n",
       " 'Enter the Dragon (1973)': '3.0',\n",
       " '\"Dirty Dozen': '4.0',\n",
       " 'Goldfinger (1964)': '3.5',\n",
       " 'Ladyhawke (1985)': '4.0',\n",
       " 'Fight Club (1999)': '4.0',\n",
       " 'Jabberwocky (1977)': '4.0',\n",
       " 'Dragonheart (1996)': '3.0',\n",
       " 'Jumanji (1995)': '3.5',\n",
       " 'Star Wars: Episode V - The Empire Strikes Back (1980)': '4.5',\n",
       " '\"Usual Suspects': '3.5',\n",
       " 'Van Helsing (2004)': '4.0',\n",
       " '\"Good': '3.0',\n",
       " '28 Days Later (2002)': '3.5',\n",
       " 'Dune (1984)': '4.0',\n",
       " '\"Wicker Man': '4.0',\n",
       " \"What's Eating Gilbert Grape (1993)\": '3.5',\n",
       " '\"Lord of the Rings: The Fellowship of the Ring': '5.0',\n",
       " 'Godsend (2004)': '3.5',\n",
       " 'Apocalypse Now (1979)': '3.5',\n",
       " \"Bill & Ted's Excellent Adventure (1989)\": '4.0',\n",
       " 'Pulp Fiction (1994)': '4.0',\n",
       " '\"Shining': '4.0',\n",
       " '\"Exorcist': '3.5',\n",
       " 'Poltergeist (1982)': '3.5',\n",
       " 'Star Kid (1997)': '3.5',\n",
       " 'Chitty Chitty Bang Bang (1968)': '3.5',\n",
       " '\"Femme Nikita': '4.0',\n",
       " '\"Others': '3.5',\n",
       " 'Clerks (1994)': '4.0',\n",
       " 'Full Metal Jacket (1987)': '3.5',\n",
       " 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)': '4.0',\n",
       " '\"Terminator': '4.0',\n",
       " '\"Borrowers': '3.5',\n",
       " 'Psycho (1960)': '4.0',\n",
       " 'Stand by Me (1986)': '4.0',\n",
       " 'Ran (1985)': '3.5',\n",
       " 'Army of Darkness (1993)': '4.0',\n",
       " 'Rosencrantz and Guildenstern Are Dead (1990)': '3.0',\n",
       " 'Heavy Metal 2000 (2000)': '3.5',\n",
       " 'Labyrinth (1986)': '4.0',\n",
       " 'Dead Poets Society (1989)': '3.5',\n",
       " 'Alien (1979)': '4.0',\n",
       " \"Bill & Ted's Bogus Journey (1991)\": '3.5',\n",
       " 'Aliens (1986)': '4.0',\n",
       " 'Small Soldiers (1998)': '3.5',\n",
       " '\"Great Escape': '3.5',\n",
       " 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)': '4.5',\n",
       " '\"Lord of the Rings: The Two Towers': '5.0',\n",
       " 'Slaughterhouse-Five (1972)': '3.5',\n",
       " '\"Last Unicorn': '4.0',\n",
       " 'Unforgiven (1992)': '4.0',\n",
       " '\"Adventures of Baron Munchausen': '4.0',\n",
       " 'Evil Dead II (Dead by Dawn) (1987)': '3.5',\n",
       " 'Splash (1984)': '4.0',\n",
       " 'Harry Potter and the Prisoner of Azkaban (2004)': '4.0',\n",
       " 'Young Frankenstein (1974)': '4.0',\n",
       " 'Rob Roy (1995)': '4.0'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['1']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "下面计算两位用户之间的相似度。首先需要找到两位用户之间共同评论的电影，在使用欧氏距离公式计算距离，最后计算两位用户之间的相似度。 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import *\n",
    "\n",
    "def Euclidean(user1, user2):\n",
    "    # 两位用户交集\n",
    "    user1_data = data[user1]\n",
    "    user2_data = data[user2]\n",
    "    distance = 0\n",
    "    for key in user1_data.keys():\n",
    "        if key in user2_data.keys():\n",
    "            distance += pow(float(user1_data[key]) - float(user2_data[key]), 2)\n",
    "    # 值越小，相似度越高\n",
    "    return 1/(1+sqrt(distance))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.20658711810431302"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Euclidean('1', '2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.27792629762666365"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Euclidean('1', '4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2 协同过滤推荐算法的实现\n",
    "\n",
    "公式待定。。。。\n",
    "\n",
    "Pearson相关系数可以用来衡量两个变量之间的线性关系，它的值范围[-1, 1], 1: 完全线性相关，0: 完全不线性相关，-1: 完全负相关。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算两位用户之间的Pearson相关系数\n",
    "def pearson_sim(user1, user2):\n",
    "    user1_data=data[user1]\n",
    "    user2_data=data[user2]\n",
    "    distance=0\n",
    "    common={}\n",
    "    #找到都评价过的电影                                                                                                                                                                                       \n",
    "    for key in user1_data:\n",
    "        if key in user2_data:\n",
    "            common[key]=1\n",
    "    #如果没有共同的电影，返回0                                                                                                                                                                              \n",
    "    if len(common)==0:\n",
    "        return 0\n",
    "     #计算电影数目                                                                                                                                                                                              \n",
    "    n=len(common)\n",
    "\n",
    "   #计算评分和                                                                                                                                                                                               \n",
    "    sum1 = sum([float(user1_data[movie]) for movie in common])\n",
    "    sum2 = sum([float(user2_data[movie]) for movie in common])\n",
    "    #计算评分平方和                                                                                                                                                                                                 \n",
    "    sum1Sq=sum([pow(float(user1_data[movie]),2) for movie in common])\n",
    "    sum2Sq=sum([pow(float(user2_data[movie]), 2) for movie in common])\n",
    "    # 计算乘积和                                                                                                                                                                                              \n",
    "    pSum=sum([float(user1_data[it])*float(user2_data[it]) for it in common])\n",
    "\n",
    "    # 计算Pearson系数                                                                                                                                                                                         \n",
    "    num=pSum-(sum1*sum2/n)\n",
    "    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))\n",
    "    if den==0:\n",
    "        return 0\n",
    "    r=num/den\n",
    "    return r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.06189844605901989"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pearson_sim('1', '2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pearson_sim('1', '4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
